This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
#Import Libraries
library(aplore3)
library(caret)
library(dplyr)
library(tidyverse)
library(car)
require(ggthemes)
library(glmnet)
library(cowplot)
library(GGally)
library(ResourceSelection)
library(ROCR)
library(pROC)
library(doParallel)
library(qwraps2)
cl <- makePSOCKcluster(7)
registerDoParallel(cl)
#Import Data
data = glow_bonemed
data = data.frame(data)
dim(data)
[1] 500 18
data$sub_id = as.factor(data$sub_id)
data$site_id = as.factor(data$site_id)
data$phy_id = as.factor(data$phy_id)
#Split train and test set
set.seed(66)
trainIndex <- createDataPartition(data$fracture, p = .8,
list = FALSE,
times = 1)
train <- data[trainIndex,]
test <- data[-trainIndex,]
dim(train)
[1] 400 18
dim(test)
[1] 100 18
All of the EDA will be done in the train data
View head of dataframe
head(train)
Looking at Fracture balance
# look for class imbalance
# The dataset is hevaily imbalance with more No's than Yes
data_classes = data %>% ggplot(aes(x=fracture)) + geom_bar() + theme_fivethirtyeight()
train_classes = train %>% ggplot(aes(x=fracture)) + geom_bar() + theme_fivethirtyeight()
test_classes = test %>% ggplot(aes(x=fracture)) + geom_bar() + theme_fivethirtyeight()
plot_grid(data_classes, train_classes, test_classes, labels = c("Overall Data", "Train Data", "Test Data"))
Let’s look at pair plots from all the numeric variables
train_numeric = train %>% select_if(is.numeric)
pairs(train[,2:8],col=as.factor(train$fracture))
Looking at a different view of pair plots for numerical variables. Excluding id’s
ggpairs(train,columns=4:8,aes(colour=fracture))
Looking at box plot for different numerical variables per fracture or not
boxplot_age = train %>% ggplot(aes(y=age, x=fracture)) + geom_boxplot() + ggtitle("age vs fracture") + theme_fivethirtyeight()
boxplot_weight = train %>% ggplot(aes(y=weight, x=fracture)) + geom_boxplot() + ggtitle("weight vs fracture") + theme_fivethirtyeight()
boxplot_height = train %>% ggplot(aes(y=height, x=fracture)) + geom_boxplot() + ggtitle("height vs fracture") + theme_fivethirtyeight()
boxplot_bmi= train %>% ggplot(aes(y=bmi, x=fracture)) + geom_boxplot() + ggtitle("bmi vs fracture") + theme_fivethirtyeight()
boxplot_fracscore= train %>% ggplot(aes(y=fracscore, x=fracture)) + geom_boxplot() + ggtitle("bmi vs fracture") + theme_fivethirtyeight()
plot_grid(boxplot_age, boxplot_weight, boxplot_height, boxplot_bmi, boxplot_fracscore, nrow=2, ncol=2)
Lets look at bmi vs age per different categorical variables
# relation of bmi and age
age_bim_fracture = train %>% ggplot(aes(x=age, y=bmi, col=fracture)) + geom_point() + geom_smooth(method = 'loess' , formula = 'y ~ x'
) + ggtitle("bmi vs age") + xlab("age") + ylab("bmi") + theme_minimal()
age_bim_premeno = train %>% ggplot(aes(x=age, y=bmi, col=premeno)) + geom_point() + geom_smooth(method = 'loess' , formula = 'y ~ x'
) + ggtitle("bmi vs age") + xlab("age") + ylab("bmi") + theme_minimal()
age_bim_smoke = train %>% ggplot(aes(x=age, y=bmi, col=raterisk)) + geom_point() + geom_smooth(method = 'loess' , formula = 'y ~ x'
) + ggtitle("bmi vs age") + xlab("age") + ylab("bmi") + theme_minimal()
age_bim_raterisk = train %>% ggplot(aes(x=age, y=bmi, col=smoke)) + geom_point() + geom_smooth(method = 'loess' , formula = 'y ~ x'
) + ggtitle("bmi vs age") + xlab("age") + ylab("bmi") + theme_minimal()
plot_grid(age_bim_fracture, age_bim_premeno,age_bim_smoke, age_bim_raterisk, nrow=4, ncol=1)
Lets look at different numerica variables vs categorical variables per site id The point os to investigate if site id had any impact
bmi_frac_type = train %>% ggplot(aes(x=fracture, y=bmi, col=as.factor(site_id))) + geom_boxplot() + ggtitle("BMI for fracture type per site id")
age_frac_type = train %>% ggplot(aes(x=fracture, y=age, col=as.factor(site_id))) + geom_boxplot() + ggtitle("Age for fracture type per site id")
weight_frac_type = train %>% ggplot(aes(x=fracture, y=weight, col=as.factor(site_id))) + geom_boxplot() + ggtitle("Weight for fracture type per site id")
height_frac_type = train %>% ggplot(aes(x=fracture, y=height, col=as.factor(site_id))) + geom_boxplot() + ggtitle("Height for fracture type per site id")
plot_grid(bmi_frac_type, age_frac_type, weight_frac_type,height_frac_type, nrow=2, ncol=2)
colnames(train)
[1] "sub_id" "site_id" "phy_id" "priorfrac" "age" "weight" "height" "bmi" "premeno" "momfrac" "armassist"
[12] "smoke" "raterisk" "fracscore" "fracture" "bonemed" "bonemed_fu" "bonetreat"
options(qwraps2_markup = "markdown")
our_summary1 <-
list("Age" =
list("min" = ~ min(age),
"median" = ~ median(age),
"max" = ~ max(age),
"mean (sd)" = ~ qwraps2::mean_sd(age)),
"Weight" =
list("min" = ~ min(weight),
"median" = ~ median(weight),
"max" = ~ max(weight),
"mean (sd)" = ~ qwraps2::mean_sd(weight)),
"Height" =
list("min" = ~ min(height),
"median" = ~ median(height),
"max" = ~ max(height),
"mean (sd)" = ~ qwraps2::mean_sd(height)),
"BMI" =
list("min" = ~ min(bmi),
"median" = ~ median(bmi),
"max" = ~ max(bmi),
"mean (sd)" = ~ qwraps2::mean_sd(bmi)),
"Frac Score" =
list("min" = ~ min(fracscore),
"median" = ~ median(fracscore),
"max" = ~ max(fracscore),
"mean (sd)" = ~ qwraps2::mean_sd(fracscore))
)
summary_table(train, our_summary1)
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
| train (N = 400) | |
|---|---|
| Age | Â Â |
| Â Â min | 55 |
| Â Â median | 67 |
| Â Â max | 90 |
|   mean (sd) | 68.64 ± 8.98 |
| Weight | Â Â |
| Â Â min | 39.9 |
| Â Â median | 68 |
| Â Â max | 124.7 |
|   mean (sd) | 71.67 ± 16.28 |
| Height | Â Â |
| Â Â min | 134 |
| Â Â median | 162 |
| Â Â max | 199 |
|   mean (sd) | 161.53 ± 6.40 |
| BMI | Â Â |
| Â Â min | 14.87637 |
| Â Â median | 26.36709 |
| Â Â max | 49.08241 |
|   mean (sd) | 27.44 ± 5.91 |
| Frac Score | Â Â |
| Â Â min | 0 |
| Â Â median | 4 |
| Â Â max | 11 |
|   mean (sd) | 3.73 ± 2.50 |
our_summary1 <-
list("Age" =
list("min" = ~ min(age),
"median" = ~ median(age),
"max" = ~ max(age),
"mean (sd)" = ~ qwraps2::mean_sd(age)),
"Weight" =
list("min" = ~ min(weight),
"median" = ~ median(weight),
"max" = ~ max(weight),
"mean (sd)" = ~ qwraps2::mean_sd(weight)),
"Height" =
list("min" = ~ min(height),
"median" = ~ median(height),
"max" = ~ max(height),
"mean (sd)" = ~ qwraps2::mean_sd(height)),
"BMI" =
list("min" = ~ min(bmi),
"median" = ~ median(bmi),
"max" = ~ max(bmi),
"mean (sd)" = ~ qwraps2::mean_sd(bmi)),
"Frac Score" =
list("min" = ~ min(fracscore),
"median" = ~ median(fracscore),
"max" = ~ max(fracscore),
"mean (sd)" = ~ qwraps2::mean_sd(fracscore))
)
summary_table(dplyr::group_by(train,fracture), our_summary1)
| No (N = 300) | Yes (N = 100) | |
|---|---|---|
| Age | Â Â | Â Â |
| Â Â min | 55 | 56 |
| Â Â median | 66 | 72 |
| Â Â max | 90 | 89 |
|   mean (sd) | 67.61 ± 8.84 | 71.72 ± 8.74 |
| Weight | Â Â | Â Â |
| Â Â min | 39.9 | 45.8 |
| Â Â median | 68 | 69.15 |
| Â Â max | 120.2 | 124.7 |
|   mean (sd) | 71.89 ± 16.53 | 71.02 ± 15.60 |
| Height | Â Â | Â Â |
| Â Â min | 148 | 134 |
| Â Â median | 162 | 160 |
| Â Â max | 199 | 178 |
|   mean (sd) | 162.09 ± 6.05 | 159.85 ± 7.10 |
| BMI | Â Â | Â Â |
| Â Â min | 14.87637 | 18.36349 |
| Â Â median | 26.30405 | 26.4308 |
| Â Â max | 49.08241 | 44.03628 |
|   mean (sd) | 27.32 ± 5.94 | 27.79 ± 5.81 |
| Frac Score | Â Â | Â Â |
| Â Â min | 0 | 0 |
| Â Â median | 3 | 5 |
| Â Â max | 11 | 9 |
|   mean (sd) | 3.37 ± 2.44 | 4.81 ± 2.39 |
NA
options(qwraps2_markup = "markdown")
our_summary1 <-
list("Age" =
list("min" = ~ min(age),
"median" = ~ median(age),
"max" = ~ max(age),
"mean (sd)" = ~ qwraps2::mean_sd(age)),
"Weight" =
list("min" = ~ min(weight),
"median" = ~ median(weight),
"max" = ~ max(weight),
"mean (sd)" = ~ qwraps2::mean_sd(weight)),
"Height" =
list("min" = ~ min(height),
"median" = ~ median(height),
"max" = ~ max(height),
"mean (sd)" = ~ qwraps2::mean_sd(height)),
"BMI" =
list("min" = ~ min(bmi),
"median" = ~ median(bmi),
"max" = ~ max(bmi),
"mean (sd)" = ~ qwraps2::mean_sd(bmi))
)
summary_table(train, our_summary1)
| train (N = 400) | |
|---|---|
| Age | Â Â |
| Â Â min | 55 |
| Â Â median | 67 |
| Â Â max | 90 |
|   mean (sd) | 68.64 ± 8.98 |
| Weight | Â Â |
| Â Â min | 39.9 |
| Â Â median | 68 |
| Â Â max | 124.7 |
|   mean (sd) | 71.67 ± 16.28 |
| Height | Â Â |
| Â Â min | 134 |
| Â Â median | 162 |
| Â Â max | 199 |
|   mean (sd) | 161.53 ± 6.40 |
| BMI | Â Â |
| Â Â min | 14.87637 |
| Â Â median | 26.36709 |
| Â Â max | 49.08241 |
|   mean (sd) | 27.44 ± 5.91 |
#Functions
fit_pred = function(model, x, y, m){
if(m=="lasso"){
fit.pred = predict(model, newx = x, type = "response")
}
if(m=="lda"){
fit.pred= predict(model, x, type = "prob")
fit.pred = fit.pred[,2]
}
if(m=="rf"){
fit.pred= predict(model, x, type = "prob")
fit.pred = fit.pred[,2]
}
if(m=="stepwise"){
fit.pred = predict(model, newdata = x, type = "response")
}
return(fit.pred)
}
make_predictions = function(model, x, y, m){
fit.pred = fit_pred(model, x, y, m)
results = prediction(fit.pred, y,
label.ordering=c("No","Yes"))
return(results)
}
classification_metrics = function(cutoff, model, model_type, x, y, m) {
fit.pred = fit_pred(model, x, y, m)
class<-factor(ifelse(fit.pred>cutoff,"Yes","No"),levels=c("No","Yes"))
#Confusion Matrix for Lasso
conf<-table(class,y)
print(paste("Confusion matrix for ", model_type))
print(conf)
precision <- posPredValue(class, y, positive="Yes")
recall <- sensitivity(class, y, positive="Yes")
F1 <- (2 * precision * recall) / (precision + recall)
print(paste("accuracy = ", round(mean(class==y) ,3), sep = ""))
print(paste("precision = ", round(precision ,3), sep = ""))
print(paste("recall = ", round(precision ,3), sep = ""))
print(paste("F1 = ", round(F1 ,3), sep = ""))
}
roc_metrics = function(pred_results){
roc = performance(pred_results, measure = "tpr", x.measure = "fpr")
return(roc)
}
auc_metrics = function(pred_results) {
auc <- performance(pred_results , measure = "auc")
auc <- auc@y.values
return(auc)
}
plot_roc = function (model_type, pred_results,x,y,c, ...) {
roc = roc_metrics(pred_results)
auc = auc_metrics(pred_results)
plot(roc, colorize = c, ...)
abline(a=0, b= 1)
text(x = x, y = y, paste(model_type," AUC = ", round(auc[[1]],3), sep = ""))
}
Lets train an interpretable logistic regression using the lasso technique The point of this model is to be interpretable, meaning no exotic variables such as iteraction terms
str(train)
'data.frame': 400 obs. of 18 variables:
$ sub_id : Factor w/ 500 levels "1","2","3","4",..: 1 2 3 4 5 6 8 9 10 12 ...
$ site_id : Factor w/ 6 levels "1","2","3","4",..: 1 4 6 6 1 5 1 1 4 1 ...
$ phy_id : Factor w/ 127 levels "1","3","7","8",..: 5 89 110 113 23 104 22 4 87 20 ...
$ priorfrac : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 2 2 2 1 1 ...
$ age : int 62 65 88 82 61 67 82 86 58 56 ...
$ weight : num 70.3 87.1 50.8 62.1 68 ...
$ height : int 158 160 157 160 152 161 153 156 166 167 ...
$ bmi : num 28.2 34 20.6 24.3 29.4 ...
$ premeno : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ momfrac : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 1 1 1 1 1 ...
$ armassist : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 1 1 1 1 2 ...
$ smoke : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 1 2 ...
$ raterisk : Factor w/ 3 levels "Less","Same",..: 2 2 1 1 2 2 2 2 1 2 ...
$ fracscore : int 1 2 11 5 1 4 7 7 0 3 ...
$ fracture : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ bonemed : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 1 ...
$ bonemed_fu: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 1 ...
$ bonetreat : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 1 ...
train.x <- model.matrix(fracture~ priorfrac + age + weight + height + bmi + premeno + momfrac + armassist + smoke+ raterisk + fracscore + bonemed + bonemed_fu, train)
train.y<-train[,15]
nFolds = 10
set.seed(4)
foldid = sample(rep(seq(nFolds), length.out = nrow(train.x)))
lambdas_to_try <- 10^seq(-5, 5, length.out = 2000)
set.seed(5)
cvfit = cv.glmnet(train.x, train.y,
family = "binomial",
alpha=1,
type.measure = "auc",
lambda = lambdas_to_try,
nfolds = nFolds,
foldid = foldid,
parallel = T
)
plot(cvfit)
coef(cvfit, s = "lambda.min")
16 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 2.75972566
(Intercept) .
priorfracYes 0.20982197
age .
weight .
height -0.03654858
bmi 0.02983405
premenoYes 0.33817019
momfracYes 0.54135015
armassistYes .
smokeYes -0.51648719
rateriskSame 0.08395715
rateriskGreater 0.30842871
fracscore 0.17234464
bonemedYes .
bonemed_fuYes 0.61466291
print("CV AUC:")
[1] "CV AUC:"
cvfit$cvm[which(cvfit$lambda==cvfit$lambda.min)]
[1] 0.7160265
#Optimal penalty
print("Penalty Value:")
[1] "Penalty Value:"
cvfit$lambda.min
[1] 0.008345661
build a final interpretable model based on feature selection and lambda value selected above
#For final model predictions go ahead and refit lasso using entire
#data set
#finalmodel = glmnet(train.x, train.y, family = "binomial",lambda=cvfit$lambda.min)
finalmodel_lasso<-glmnet(train.x, train.y,
family = "binomial",
alpha=1,
lambda = cvfit$lambda.min )
summary(finalmodel_lasso)
Length Class Mode
a0 1 -none- numeric
beta 15 dgCMatrix S4
df 1 -none- numeric
dim 2 -none- numeric
lambda 1 -none- numeric
dev.ratio 1 -none- numeric
nulldev 1 -none- numeric
npasses 1 -none- numeric
jerr 1 -none- numeric
offset 1 -none- logical
classnames 2 -none- character
call 6 -none- call
nobs 1 -none- numeric
coef(finalmodel_lasso)
16 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) 2.75904839
(Intercept) .
priorfracYes 0.20973560
age .
weight .
height -0.03654457
bmi 0.02983270
premenoYes 0.33815169
momfracYes 0.54127143
armassistYes .
smokeYes -0.51647430
rateriskSame 0.08411468
rateriskGreater 0.30853430
fracscore 0.17235143
bonemedYes .
bonemed_fuYes 0.61463897
finalmodel_lasso
Call: glmnet(x = train.x, y = train.y, family = "binomial", alpha = 1, lambda = cvfit$lambda.min)
Df %Dev Lambda
1 10 11.6 0.008346
finalmodel<-glm(fracture ~ priorfrac + height + bmi + premeno + momfrac +
smoke + raterisk + raterisk + fracscore +
bonemed_fu
, data=train,family = binomial(link="logit"))
coef(finalmodel)
(Intercept) priorfracYes height bmi premenoYes momfracYes smokeYes rateriskSame rateriskGreater
3.40776944 0.23453865 -0.04445750 0.04098255 0.49051506 0.66485670 -0.81240346 0.28440869 0.51029407
fracscore bonemed_fuYes
0.19484141 0.68517384
confint(finalmodel)
Waiting for profiling to be done...
2.5 % 97.5 %
(Intercept) -3.354581776 10.335620486
priorfracYes -0.370889267 0.830248373
height -0.086352571 -0.004159104
bmi -0.001765665 0.083791563
premenoYes -0.131522644 1.097160098
momfracYes -0.039143831 1.353228852
smokeYes -2.123132353 0.258978045
rateriskSame -0.341759864 0.923461916
rateriskGreater -0.155008633 1.187028169
fracscore 0.077177118 0.315554062
bonemed_fuYes 0.120048042 1.250983871
summary(finalmodel)
Call:
glm(formula = fracture ~ priorfrac + height + bmi + premeno +
momfrac + smoke + raterisk + raterisk + fracscore + bonemed_fu,
family = binomial(link = "logit"), data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.76344 -0.75450 -0.53216 0.00826 2.27388
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 3.40777 3.48197 0.979 0.32773
priorfracYes 0.23454 0.30566 0.767 0.44290
height -0.04446 0.02091 -2.126 0.03347 *
bmi 0.04098 0.02174 1.885 0.05945 .
premenoYes 0.49052 0.31216 1.571 0.11610
momfracYes 0.66486 0.35364 1.880 0.06010 .
smokeYes -0.81240 0.59443 -1.367 0.17172
rateriskSame 0.28441 0.32152 0.885 0.37639
rateriskGreater 0.51029 0.34114 1.496 0.13469
fracscore 0.19484 0.06064 3.213 0.00131 **
bonemed_fuYes 0.68517 0.28780 2.381 0.01728 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 449.87 on 399 degrees of freedom
Residual deviance: 395.77 on 389 degrees of freedom
AIC: 417.77
Number of Fisher Scoring iterations: 4
train.x <- model.matrix(fracture~ priorfrac + age + weight + height + bmi + premeno + momfrac + armassist + smoke+ raterisk + fracscore + bonemed + bonemed_fu, train)
train.y<-train[,15]
preds_lasso = make_predictions(finalmodel_lasso, train.x, train.y, "lasso")
plot_roc("Lasso",preds_lasso,0.2,0.7,T)
classification_metrics(0.22, finalmodel_lasso, "Lasso", train.x, train.y, "lasso")
[1] "Confusion matrix for Lasso"
y
class No Yes
No 175 26
Yes 125 74
[1] "accuracy = 0.623"
[1] "precision = 0.372"
[1] "recall = 0.372"
[1] "F1 = 0.495"
vif(finalmodel)
GVIF Df GVIF^(1/(2*Df))
priorfrac 1.378453 1 1.174076
height 1.071086 1 1.034933
bmi 1.139912 1 1.067667
premeno 1.080826 1 1.039628
momfrac 1.078694 1 1.038602
smoke 1.037147 1 1.018404
raterisk 1.174003 2 1.040920
fracscore 1.437724 1 1.199051
bonemed_fu 1.232295 1 1.110088
plot(finalmodel)
lets look at predictions for the lasso model also looking at the roc plot to select the most optimal threhold for classification
hoslem.test(finalmodel$y, fitted(finalmodel), g=10)
Hosmer and Lemeshow goodness of fit (GOF) test
data: finalmodel$y, fitted(finalmodel)
X-squared = 3.2685, df = 8, p-value = 0.9164
There is a large p-value so the test is a fit
test.x <- model.matrix(fracture~ priorfrac + age + weight + height + bmi + premeno + momfrac + armassist + smoke+ raterisk + fracscore + bonemed + bonemed_fu, test)
test.y<-test[,15]
preds_lasso = make_predictions(finalmodel_lasso, test.x, test.y, "lasso")
plot_roc("Lasso",preds_lasso,0.2,0.7,T)
classification_metrics(0.3, finalmodel_lasso, "Lasso", test.x, test.y, "lasso")
[1] "Confusion matrix for Lasso"
y
class No Yes
No 55 15
Yes 20 10
[1] "accuracy = 0.65"
[1] "precision = 0.333"
[1] "recall = 0.333"
[1] "F1 = 0.364"
lets look at model performance metrics
library(leaps)
nvmax = 14
reg_sq=regsubsets(fracture~.-sub_id-site_id-phy_id-bonetreat,data=train, method="seqrep", nvmax=nvmax)
par(mfrow=c(2,2))
cp<-summary(reg_sq)$cp
plot(1:(nvmax),cp,type="l",ylab="CP",xlab="# of predictors")
index<-which(cp==min(cp))
points(index,cp[index],col="red",pch=10)
bics<-summary(reg_sq)$bic
plot(1:(nvmax),bics,type="l",ylab="BIC",xlab="# of predictors")
index<-which(bics==-0.05839447)
points(index,bics[index],col="red",pch=10)
adjr2<-summary(reg_sq)$adjr2
plot(1:(nvmax),adjr2,type="l",ylab="Adjusted R-squared",xlab="# of predictors")
index<-which(adjr2==max(adjr2))
points(index,adjr2[index],col="red",pch=10)
rss<-summary(reg_sq)$rss
plot(1:(nvmax),rss,type="l",ylab="train RSS",xlab="# of predictors")
index<-which(rss==min(rss))
points(index,rss[index],col="red",pch=10)
cbind(CP=summary(reg_sq)$cp,
r2=summary(reg_sq)$rsq,
Adj_r2=summary(reg_sq)$adjr2,
BIC=summary(reg_sq)$bic,
RSS = summary(reg_sq)$rss)
CP r2 Adj_r2 BIC RSS
[1,] 21.250487 0.06228673 0.05993067 -13.741495 70.32850
[2,] 12.832558 0.08569960 0.08109355 -17.864044 68.57253
[3,] 10.601089 0.09520924 0.08835477 -16.054770 67.85931
[4,] 8.301930 0.10487101 0.09580642 -14.357658 67.13467
[5,] 5.502043 0.11565810 0.10443549 -13.215824 66.32564
[6,] 5.136300 0.12097478 0.10755455 -9.636426 65.92689
[7,] 5.395537 0.12488691 0.10925989 -5.429147 65.63348
[8,] 5.468230 0.12921827 0.11140176 -1.422391 65.30863
[9,] 16.696131 0.10847983 0.08790628 13.983760 66.86401
[10,] 7.734269 0.13311511 0.11083015 8.766479 65.01637
[11,] 9.334520 0.13401349 0.10946233 14.343195 64.94899
[12,] 11.028543 0.13470113 0.10787016 20.016911 64.89742
[13,] 13.010891 0.13474080 0.10559995 25.990037 64.89444
[14,] 15.000000 0.13476528 0.10330220 31.970187 64.89260
coef(reg_sq, 8)
(Intercept) weight bmi premenoYes momfracYes smokeYes rateriskGreater fracscore bonemed_fuYes
0.843129090 -0.008740739 0.029712437 0.075965266 0.121425302 -0.115804861 0.068753886 0.036854265 0.132305160
summary.out <- summary(reg_sq)
which.max(summary.out$adjr2)
[1] 8
summary.out$which[8,]
(Intercept) priorfracYes age weight height bmi premenoYes momfracYes armassistYes
TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE FALSE
smokeYes rateriskSame rateriskGreater fracscore bonemedYes bonemed_fuYes
TRUE FALSE TRUE TRUE FALSE TRUE
#To deal with the redundamcy, I would throw the cylinder variable out and then see what happens
model.main<-glm(fracture ~raterisk+weight+bmi+premeno+momfrac+fracscore+bonemed_fu+smoke, data=train,family = binomial(link="logit"))
summary(model.main)
Call:
glm(formula = fracture ~ raterisk + weight + bmi + premeno +
momfrac + fracscore + bonemed_fu + smoke, family = binomial(link = "logit"),
data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.75847 -0.74349 -0.53853 -0.03179 2.29136
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.78703 0.74016 -5.117 3.11e-07 ***
rateriskSame 0.29070 0.32185 0.903 0.36641
rateriskGreater 0.56026 0.33585 1.668 0.09528 .
weight -0.05048 0.02278 -2.216 0.02671 *
bmi 0.17195 0.06177 2.784 0.00537 **
premenoYes 0.47993 0.31256 1.535 0.12466
momfracYes 0.62776 0.35007 1.793 0.07294 .
fracscore 0.21909 0.05289 4.142 3.44e-05 ***
bonemed_fuYes 0.69337 0.28701 2.416 0.01570 *
smokeYes -0.80144 0.59573 -1.345 0.17853
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 449.87 on 399 degrees of freedom
Residual deviance: 395.89 on 390 degrees of freedom
AIC: 415.89
Number of Fisher Scoring iterations: 4
exp(cbind("Odds ratio" = coef(model.main), confint.default(model.main, level = 0.95)))
Odds ratio 2.5 % 97.5 %
(Intercept) 0.02266271 0.005312358 0.09667991
rateriskSame 1.33736473 0.711685586 2.51311038
rateriskGreater 1.75112184 0.906650867 3.38214830
weight 0.95077157 0.909250707 0.99418848
bmi 1.18761885 1.052203055 1.34046230
premenoYes 1.61596380 0.875750592 2.98182957
momfracYes 1.87340483 0.943300490 3.72060195
fracscore 1.24494529 1.122346773 1.38093575
bonemed_fuYes 2.00044972 1.139784673 3.51101325
smokeYes 0.44868282 0.139588358 1.44221395
vif(model.main)
GVIF Df GVIF^(1/(2*Df))
raterisk 1.136099 2 1.032414
weight 9.291085 1 3.048128
bmi 9.173003 1 3.028697
premeno 1.081246 1 1.039830
momfrac 1.061681 1 1.030379
fracscore 1.091831 1 1.044907
bonemed_fu 1.226947 1 1.107677
smoke 1.037015 1 1.018339
#Residual diagnostics can be obtained using
plot(model.main)
train_select = subset(train, select = c(weight,bmi,premeno,momfrac,fracscore,bonemed_fu,smoke,raterisk,fracture))
preds_step = make_predictions(model.main, train_select,train_select$fracture, "stepwise")
plot_roc("Step",preds_step,0.2,0.8,T)
classification_metrics(0.22, model.main, "Step", train_select,train_select$fracture, "stepwise")
[1] "Confusion matrix for Step"
y
class No Yes
No 184 27
Yes 116 73
[1] "accuracy = 0.642"
[1] "precision = 0.386"
[1] "recall = 0.386"
[1] "F1 = 0.505"
test_select = subset(test, select = c(weight,bmi,premeno,momfrac,fracscore,bonemed_fu,smoke,raterisk,fracture))
preds_step = make_predictions(model.main, test_select,test_select$fracture, "stepwise")
plot_roc("Step",preds_step,0.2,0.8,T)
classification_metrics(0.22, model.main, "Step", test_select,test_select$fracture, "stepwise")
[1] "Confusion matrix for Step"
y
class No Yes
No 47 9
Yes 28 16
[1] "accuracy = 0.63"
[1] "precision = 0.364"
[1] "recall = 0.364"
[1] "F1 = 0.464"
lets look at model performance metrics
plot_roc("Step",preds_step,0.2,0.8,F, col="red")
par(new=T)
plot_roc("Lasso",preds_lasso,0.2,0.7,F, col="blue")
legend(0.7, 0.3,legend = c("Step","Lasso"), fill=c("red","blue"))
Objective 2
# Feature Eng
#Square numerical variables
train$ageSquared = train$age**2
train$weightSquared = train$weight**2
train$heigthSquared = train$height**2
train$bmiSquared = train$bmi**2
#Cubic numerical variables
train$ageCubic = train$age**3
train$weightCubic = train$weight**3
train$heigthCubic = train$height**3
train$bmiCubic= train$bmi**3
#Square numerical variables
test$ageSquared = test$age**2
test$weightSquared = test$weight**2
test$heigthSquared = test$height**2
test$bmiSquared = test$bmi**2
#Cubic numerical variables
test$ageCubic = test$age**3
test$weightCubic = test$weight**3
test$heigthCubic = test$height**3
test$bmiCubic= test$bmi**3
# drop cols
train = subset(train, select = -c(sub_id,site_id,phy_id))
test = subset(test, select = -c(sub_id,site_id,phy_id))
head(train)
# Scale numeric variables
preProcValues = preProcess(train, method = c("scale"))
train_transformed <- predict(preProcValues, train)
test_transformed <- predict(preProcValues, test)
test_transformed
NA
NA
NA
train.x <- model.matrix(fracture~ .*. , train_transformed)
train.y<-train_transformed[,12]
nFolds = 10
set.seed(3)
foldid = sample(rep(seq(nFolds), length.out = nrow(train.x)))
lambdas_to_try <- 10^seq(-3, 5, length.out = 2000)
set.seed(3)
cvfit = cv.glmnet(train.x, train.y,
family = "binomial",
type.measure = "class",
lambda = lambdas_to_try,
nfolds = nFolds,
foldid = foldid)
plot(cvfit)
coef(cvfit, s = "lambda.min")
277 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 2.959400e+00
(Intercept) .
priorfracYes .
age .
weight .
height -2.129608e-01
bmi .
premenoYes .
momfracYes .
armassistYes 1.954138e-01
smokeYes .
rateriskSame .
rateriskGreater .
fracscore 3.059031e-01
bonemedYes .
bonemed_fuYes .
bonetreatYes .
ageSquared .
weightSquared .
heigthSquared .
bmiSquared .
ageCubic .
weightCubic .
heigthCubic .
bmiCubic .
priorfracYes:age .
priorfracYes:weight .
priorfracYes:height .
priorfracYes:bmi .
priorfracYes:premenoYes -1.179081e+00
priorfracYes:momfracYes -1.101638e+00
priorfracYes:armassistYes .
priorfracYes:smokeYes .
priorfracYes:rateriskSame 8.201626e-01
priorfracYes:rateriskGreater 1.109672e+00
priorfracYes:fracscore .
priorfracYes:bonemedYes .
priorfracYes:bonemed_fuYes .
priorfracYes:bonetreatYes -1.715055e+00
priorfracYes:ageSquared .
priorfracYes:weightSquared .
priorfracYes:heigthSquared .
priorfracYes:bmiSquared 1.359998e-01
priorfracYes:ageCubic .
priorfracYes:weightCubic .
priorfracYes:heigthCubic .
priorfracYes:bmiCubic .
age:weight .
age:height .
age:bmi .
age:premenoYes 4.571901e-02
age:momfracYes .
age:armassistYes .
age:smokeYes .
age:rateriskSame .
age:rateriskGreater .
age:fracscore .
age:bonemedYes .
age:bonemed_fuYes .
age:bonetreatYes .
age:ageSquared .
age:weightSquared .
age:heigthSquared .
age:bmiSquared .
age:ageCubic .
age:weightCubic .
age:heigthCubic .
age:bmiCubic .
weight:height .
weight:bmi .
weight:premenoYes .
weight:momfracYes .
weight:armassistYes .
weight:smokeYes .
weight:rateriskSame .
weight:rateriskGreater .
weight:fracscore .
weight:bonemedYes .
weight:bonemed_fuYes .
weight:bonetreatYes .
weight:ageSquared .
weight:weightSquared .
weight:heigthSquared .
weight:bmiSquared .
weight:ageCubic .
weight:weightCubic .
weight:heigthCubic .
weight:bmiCubic .
height:bmi .
height:premenoYes .
height:momfracYes .
height:armassistYes .
height:smokeYes .
height:rateriskSame .
height:rateriskGreater .
height:fracscore .
height:bonemedYes .
height:bonemed_fuYes .
height:bonetreatYes .
height:ageSquared .
height:weightSquared .
height:heigthSquared .
height:bmiSquared .
height:ageCubic .
height:weightCubic .
height:heigthCubic .
height:bmiCubic .
bmi:premenoYes .
bmi:momfracYes .
bmi:armassistYes 1.999989e-02
bmi:smokeYes -4.256769e-02
bmi:rateriskSame .
bmi:rateriskGreater .
bmi:fracscore 4.773642e-03
bmi:bonemedYes .
bmi:bonemed_fuYes .
bmi:bonetreatYes .
bmi:ageSquared .
bmi:weightSquared .
bmi:heigthSquared .
bmi:bmiSquared .
bmi:ageCubic .
bmi:weightCubic .
bmi:heigthCubic .
bmi:bmiCubic .
premenoYes:momfracYes .
premenoYes:armassistYes 4.980359e-01
premenoYes:smokeYes -1.627461e+00
premenoYes:rateriskSame 1.143115e-01
premenoYes:rateriskGreater 3.001823e-01
premenoYes:fracscore .
premenoYes:bonemedYes .
premenoYes:bonemed_fuYes -1.438804e-01
premenoYes:bonetreatYes .
premenoYes:ageSquared 1.153313e-01
premenoYes:weightSquared .
premenoYes:heigthSquared .
premenoYes:bmiSquared .
premenoYes:ageCubic .
premenoYes:weightCubic -2.270677e-01
premenoYes:heigthCubic .
premenoYes:bmiCubic .
momfracYes:armassistYes -1.195593e+00
momfracYes:smokeYes -2.138175e-02
momfracYes:rateriskSame 1.318237e+00
momfracYes:rateriskGreater 2.725283e-01
momfracYes:fracscore .
momfracYes:bonemedYes -2.112374e+00
momfracYes:bonemed_fuYes .
momfracYes:bonetreatYes -2.235617e-14
momfracYes:ageSquared .
momfracYes:weightSquared .
momfracYes:heigthSquared .
momfracYes:bmiSquared .
momfracYes:ageCubic 3.814712e-01
momfracYes:weightCubic .
momfracYes:heigthCubic 4.230337e-02
momfracYes:bmiCubic .
armassistYes:smokeYes .
armassistYes:rateriskSame 4.433623e-01
armassistYes:rateriskGreater -2.161403e-01
armassistYes:fracscore .
armassistYes:bonemedYes .
armassistYes:bonemed_fuYes .
armassistYes:bonetreatYes -1.110935e+00
armassistYes:ageSquared .
armassistYes:weightSquared .
armassistYes:heigthSquared .
armassistYes:bmiSquared .
armassistYes:ageCubic .
armassistYes:weightCubic .
armassistYes:heigthCubic .
armassistYes:bmiCubic .
smokeYes:rateriskSame .
smokeYes:rateriskGreater .
smokeYes:fracscore .
smokeYes:bonemedYes 5.672542e+00
smokeYes:bonemed_fuYes 3.679955e-14
smokeYes:bonetreatYes 1.066654e-15
smokeYes:ageSquared .
smokeYes:weightSquared .
smokeYes:heigthSquared .
smokeYes:bmiSquared -2.009142e-01
smokeYes:ageCubic -2.717168e-01
smokeYes:weightCubic .
smokeYes:heigthCubic .
smokeYes:bmiCubic .
rateriskSame:fracscore .
rateriskGreater:fracscore .
rateriskSame:bonemedYes .
rateriskGreater:bonemedYes .
rateriskSame:bonemed_fuYes 7.436998e-01
rateriskGreater:bonemed_fuYes .
rateriskSame:bonetreatYes .
rateriskGreater:bonetreatYes 3.094663e-01
rateriskSame:ageSquared .
rateriskGreater:ageSquared .
rateriskSame:weightSquared .
rateriskGreater:weightSquared 1.356196e-02
rateriskSame:heigthSquared .
rateriskGreater:heigthSquared .
rateriskSame:bmiSquared .
rateriskGreater:bmiSquared .
rateriskSame:ageCubic -4.552324e-02
rateriskGreater:ageCubic .
rateriskSame:weightCubic -3.051524e-01
rateriskGreater:weightCubic .
rateriskSame:heigthCubic .
rateriskGreater:heigthCubic 1.677445e-02
rateriskSame:bmiCubic .
rateriskGreater:bmiCubic .
fracscore:bonemedYes 3.616891e-01
fracscore:bonemed_fuYes .
fracscore:bonetreatYes .
fracscore:ageSquared .
fracscore:weightSquared .
fracscore:heigthSquared .
fracscore:bmiSquared .
fracscore:ageCubic .
fracscore:weightCubic .
fracscore:heigthCubic 7.546837e-05
fracscore:bmiCubic .
bonemedYes:bonemed_fuYes .
bonemedYes:bonetreatYes .
bonemedYes:ageSquared .
bonemedYes:weightSquared .
bonemedYes:heigthSquared .
bonemedYes:bmiSquared .
bonemedYes:ageCubic .
bonemedYes:weightCubic .
bonemedYes:heigthCubic .
bonemedYes:bmiCubic 4.526232e-01
bonemed_fuYes:bonetreatYes .
bonemed_fuYes:ageSquared .
bonemed_fuYes:weightSquared .
bonemed_fuYes:heigthSquared .
bonemed_fuYes:bmiSquared .
bonemed_fuYes:ageCubic .
bonemed_fuYes:weightCubic -1.189027e+00
bonemed_fuYes:heigthCubic .
bonemed_fuYes:bmiCubic 1.950171e+00
bonetreatYes:ageSquared .
bonetreatYes:weightSquared .
bonetreatYes:heigthSquared .
bonetreatYes:bmiSquared .
bonetreatYes:ageCubic .
bonetreatYes:weightCubic .
bonetreatYes:heigthCubic -1.047053e-01
bonetreatYes:bmiCubic .
ageSquared:weightSquared .
ageSquared:heigthSquared .
ageSquared:bmiSquared .
ageSquared:ageCubic .
ageSquared:weightCubic .
ageSquared:heigthCubic .
ageSquared:bmiCubic .
weightSquared:heigthSquared .
weightSquared:bmiSquared .
weightSquared:ageCubic .
weightSquared:weightCubic -7.799537e-04
weightSquared:heigthCubic .
weightSquared:bmiCubic .
heigthSquared:bmiSquared .
heigthSquared:ageCubic .
heigthSquared:weightCubic .
heigthSquared:heigthCubic .
heigthSquared:bmiCubic .
bmiSquared:ageCubic .
bmiSquared:weightCubic .
bmiSquared:heigthCubic .
bmiSquared:bmiCubic .
ageCubic:weightCubic .
ageCubic:heigthCubic .
ageCubic:bmiCubic .
weightCubic:heigthCubic .
weightCubic:bmiCubic .
heigthCubic:bmiCubic .
print("CV Error Rate:")
[1] "CV Error Rate:"
cvfit$cvm[which(cvfit$lambda==cvfit$lambda.min)]
[1] 0.2275
#Optimal penalty
print("Penalty Value:")
[1] "Penalty Value:"
cvfit$lambda.min
[1] 0.00520425
finalmode_ob2 = glmnet(train.x, train.y,
family = "binomial",
alpha=1,
lambda = cvfit$lambda.min )
summary(finalmode_ob2)
Length Class Mode
a0 1 -none- numeric
beta 276 dgCMatrix S4
df 1 -none- numeric
dim 2 -none- numeric
lambda 1 -none- numeric
dev.ratio 1 -none- numeric
nulldev 1 -none- numeric
npasses 1 -none- numeric
jerr 1 -none- numeric
offset 1 -none- logical
classnames 2 -none- character
call 6 -none- call
nobs 1 -none- numeric
coef(finalmode_ob2)
277 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) 2.914960e+00
(Intercept) .
priorfracYes .
age .
weight .
height -2.112334e-01
bmi .
premenoYes .
momfracYes .
armassistYes 2.260572e-01
smokeYes .
rateriskSame .
rateriskGreater .
fracscore 2.997467e-01
bonemedYes .
bonemed_fuYes .
bonetreatYes .
ageSquared .
weightSquared .
heigthSquared .
bmiSquared .
ageCubic .
weightCubic .
heigthCubic .
bmiCubic .
priorfracYes:age .
priorfracYes:weight .
priorfracYes:height .
priorfracYes:bmi .
priorfracYes:premenoYes -1.173120e+00
priorfracYes:momfracYes -1.101617e+00
priorfracYes:armassistYes .
priorfracYes:smokeYes .
priorfracYes:rateriskSame 8.233277e-01
priorfracYes:rateriskGreater 1.113189e+00
priorfracYes:fracscore .
priorfracYes:bonemedYes .
priorfracYes:bonemed_fuYes .
priorfracYes:bonetreatYes -1.713966e+00
priorfracYes:ageSquared .
priorfracYes:weightSquared .
priorfracYes:heigthSquared .
priorfracYes:bmiSquared 1.344424e-01
priorfracYes:ageCubic .
priorfracYes:weightCubic .
priorfracYes:heigthCubic .
priorfracYes:bmiCubic .
age:weight .
age:height .
age:bmi .
age:premenoYes 3.816808e-02
age:momfracYes .
age:armassistYes .
age:smokeYes .
age:rateriskSame .
age:rateriskGreater .
age:fracscore .
age:bonemedYes .
age:bonemed_fuYes .
age:bonetreatYes .
age:ageSquared .
age:weightSquared .
age:heigthSquared .
age:bmiSquared .
age:ageCubic .
age:weightCubic .
age:heigthCubic .
age:bmiCubic .
weight:height .
weight:bmi .
weight:premenoYes .
weight:momfracYes .
weight:armassistYes .
weight:smokeYes .
weight:rateriskSame .
weight:rateriskGreater .
weight:fracscore .
weight:bonemedYes .
weight:bonemed_fuYes .
weight:bonetreatYes .
weight:ageSquared .
weight:weightSquared .
weight:heigthSquared .
weight:bmiSquared .
weight:ageCubic .
weight:weightCubic .
weight:heigthCubic .
weight:bmiCubic .
height:bmi .
height:premenoYes .
height:momfracYes .
height:armassistYes .
height:smokeYes .
height:rateriskSame .
height:rateriskGreater .
height:fracscore .
height:bonemedYes .
height:bonemed_fuYes .
height:bonetreatYes .
height:ageSquared .
height:weightSquared .
height:heigthSquared .
height:bmiSquared .
height:ageCubic .
height:weightCubic .
height:heigthCubic .
height:bmiCubic .
bmi:premenoYes .
bmi:momfracYes .
bmi:armassistYes 1.393926e-02
bmi:smokeYes -3.047955e-02
bmi:rateriskSame .
bmi:rateriskGreater .
bmi:fracscore 6.297772e-03
bmi:bonemedYes .
bmi:bonemed_fuYes .
bmi:bonetreatYes .
bmi:ageSquared .
bmi:weightSquared .
bmi:heigthSquared .
bmi:bmiSquared .
bmi:ageCubic .
bmi:weightCubic .
bmi:heigthCubic .
bmi:bmiCubic .
premenoYes:momfracYes .
premenoYes:armassistYes 4.948762e-01
premenoYes:smokeYes -1.628943e+00
premenoYes:rateriskSame 1.180272e-01
premenoYes:rateriskGreater 3.030984e-01
premenoYes:fracscore .
premenoYes:bonemedYes .
premenoYes:bonemed_fuYes -1.427633e-01
premenoYes:bonetreatYes .
premenoYes:ageSquared 1.290376e-01
premenoYes:weightSquared .
premenoYes:heigthSquared .
premenoYes:bmiSquared .
premenoYes:ageCubic .
premenoYes:weightCubic -2.239824e-01
premenoYes:heigthCubic .
premenoYes:bmiCubic .
momfracYes:armassistYes -1.195943e+00
momfracYes:smokeYes -3.217459e-02
momfracYes:rateriskSame 1.318280e+00
momfracYes:rateriskGreater 2.739763e-01
momfracYes:fracscore .
momfracYes:bonemedYes -2.111143e+00
momfracYes:bonemed_fuYes .
momfracYes:bonetreatYes -5.212441e-16
momfracYes:ageSquared .
momfracYes:weightSquared .
momfracYes:heigthSquared .
momfracYes:bmiSquared .
momfracYes:ageCubic 3.811591e-01
momfracYes:weightCubic .
momfracYes:heigthCubic 4.236837e-02
momfracYes:bmiCubic .
armassistYes:smokeYes .
armassistYes:rateriskSame 4.411151e-01
armassistYes:rateriskGreater -2.196226e-01
armassistYes:fracscore .
armassistYes:bonemedYes .
armassistYes:bonemed_fuYes .
armassistYes:bonetreatYes -1.112681e+00
armassistYes:ageSquared .
armassistYes:weightSquared .
armassistYes:heigthSquared .
armassistYes:bmiSquared .
armassistYes:ageCubic .
armassistYes:weightCubic .
armassistYes:heigthCubic .
armassistYes:bmiCubic .
smokeYes:rateriskSame .
smokeYes:rateriskGreater .
smokeYes:fracscore .
smokeYes:bonemedYes 5.688345e+00
smokeYes:bonemed_fuYes 8.721593e-15
smokeYes:bonetreatYes 1.779917e-16
smokeYes:ageSquared .
smokeYes:weightSquared .
smokeYes:heigthSquared .
smokeYes:bmiSquared -2.178922e-01
smokeYes:ageCubic -2.789669e-01
smokeYes:weightCubic .
smokeYes:heigthCubic .
smokeYes:bmiCubic .
rateriskSame:fracscore .
rateriskGreater:fracscore .
rateriskSame:bonemedYes .
rateriskGreater:bonemedYes .
rateriskSame:bonemed_fuYes 7.404030e-01
rateriskGreater:bonemed_fuYes .
rateriskSame:bonetreatYes .
rateriskGreater:bonetreatYes 3.075943e-01
rateriskSame:ageSquared .
rateriskGreater:ageSquared .
rateriskSame:weightSquared .
rateriskGreater:weightSquared 1.399931e-02
rateriskSame:heigthSquared .
rateriskGreater:heigthSquared .
rateriskSame:bmiSquared .
rateriskGreater:bmiSquared .
rateriskSame:ageCubic -4.525273e-02
rateriskGreater:ageCubic .
rateriskSame:weightCubic -3.052743e-01
rateriskGreater:weightCubic .
rateriskSame:heigthCubic .
rateriskGreater:heigthCubic 1.661660e-02
rateriskSame:bmiCubic .
rateriskGreater:bmiCubic .
fracscore:bonemedYes 3.605574e-01
fracscore:bonemed_fuYes .
fracscore:bonetreatYes .
fracscore:ageSquared .
fracscore:weightSquared .
fracscore:heigthSquared .
fracscore:bmiSquared .
fracscore:ageCubic .
fracscore:weightCubic .
fracscore:heigthCubic 2.845694e-06
fracscore:bmiCubic .
bonemedYes:bonemed_fuYes .
bonemedYes:bonetreatYes .
bonemedYes:ageSquared .
bonemedYes:weightSquared .
bonemedYes:heigthSquared .
bonemedYes:bmiSquared .
bonemedYes:ageCubic .
bonemedYes:weightCubic .
bonemedYes:heigthCubic .
bonemedYes:bmiCubic 4.503189e-01
bonemed_fuYes:bonetreatYes .
bonemed_fuYes:ageSquared .
bonemed_fuYes:weightSquared .
bonemed_fuYes:heigthSquared .
bonemed_fuYes:bmiSquared .
bonemed_fuYes:ageCubic .
bonemed_fuYes:weightCubic -1.217340e+00
bonemed_fuYes:heigthCubic .
bonemed_fuYes:bmiCubic 1.976471e+00
bonetreatYes:ageSquared .
bonetreatYes:weightSquared .
bonetreatYes:heigthSquared .
bonetreatYes:bmiSquared .
bonetreatYes:ageCubic .
bonetreatYes:weightCubic .
bonetreatYes:heigthCubic -1.038970e-01
bonetreatYes:bmiCubic .
ageSquared:weightSquared .
ageSquared:heigthSquared .
ageSquared:bmiSquared .
ageSquared:ageCubic .
ageSquared:weightCubic .
ageSquared:heigthCubic .
ageSquared:bmiCubic .
weightSquared:heigthSquared .
weightSquared:bmiSquared .
weightSquared:ageCubic .
weightSquared:weightCubic -3.981357e-04
weightSquared:heigthCubic .
weightSquared:bmiCubic .
heigthSquared:bmiSquared .
heigthSquared:ageCubic .
heigthSquared:weightCubic .
heigthSquared:heigthCubic .
heigthSquared:bmiCubic .
bmiSquared:ageCubic .
bmiSquared:weightCubic .
bmiSquared:heigthCubic .
bmiSquared:bmiCubic .
ageCubic:weightCubic .
ageCubic:heigthCubic .
ageCubic:bmiCubic .
weightCubic:heigthCubic .
weightCubic:bmiCubic .
heigthCubic:bmiCubic .
finalmodel_lasso
Call: glmnet(x = train.x, y = train.y, family = "binomial", alpha = 1, lambda = cvfit$lambda.min)
Df %Dev Lambda
1 10 11.6 0.008346
#coef(finalmode_ob2)
#confint(finalmode_ob2)
summary(finalmode_ob2)
Length Class Mode
a0 1 -none- numeric
beta 276 dgCMatrix S4
df 1 -none- numeric
dim 2 -none- numeric
lambda 1 -none- numeric
dev.ratio 1 -none- numeric
nulldev 1 -none- numeric
npasses 1 -none- numeric
jerr 1 -none- numeric
offset 1 -none- logical
classnames 2 -none- character
call 6 -none- call
nobs 1 -none- numeric
finalmode_ob2
Call: glmnet(x = train.x, y = train.y, family = "binomial", alpha = 1, lambda = cvfit$lambda.min)
Df %Dev Lambda
1 49 29.07 0.005204
preds_ob2_lr = make_predictions(finalmode_ob2, train.x, train.y,"lasso")
plot_roc("Complex Lasso",preds_ob2_lr,0.2,0.8,T)
classification_metrics(0.30, finalmode_ob2, "Complex LR", train.x, train.y,"lasso")
[1] "Confusion matrix for Complex LR"
y
class No Yes
No 251 32
Yes 49 68
[1] "accuracy = 0.797"
[1] "precision = 0.581"
[1] "recall = 0.581"
[1] "F1 = 0.627"
test.x <- model.matrix(fracture~ .*. , test_transformed)
test.y<-test_transformed[,12]
preds_ob2_lr = make_predictions(finalmode_ob2, test.x, test.y,"lasso")
plot_roc("Complex Lasso",preds_ob2_lr,0.2,0.8,T)
classification_metrics(0.20, finalmode_ob2, "Complex LR", test.x, test.y,"lasso")
[1] "Confusion matrix for Complex LR"
y
class No Yes
No 53 10
Yes 22 15
[1] "accuracy = 0.68"
[1] "precision = 0.405"
[1] "recall = 0.405"
[1] "F1 = 0.484"
train
train_x = subset(train, select=-c(fracture))
train_x
fitControl <- trainControl(
method = 'repeatedcv',
number = 10,
repeats = 10,
savePredictions = 'final',
verboseIter = FALSE,
classProbs = TRUE,
search='grid'
)
tunegrid <- expand.grid(.mtry = (1:15))
rpart_fit = train(fracture ~ ., data=train, method="rf", trControl = fitControl, tuneGrid = tunegrid)
print(rpart_fit)
Random Forest
400 samples
22 predictor
2 classes: 'No', 'Yes'
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 10 times)
Summary of sample sizes: 360, 360, 360, 360, 360, 360, ...
Resampling results across tuning parameters:
mtry Accuracy Kappa
1 0.75150 0.02145022
2 0.76450 0.16515999
3 0.76900 0.20634366
4 0.77125 0.22632281
5 0.77025 0.23283625
6 0.77500 0.25137020
7 0.77225 0.24778541
8 0.77525 0.25764832
9 0.77400 0.25685688
10 0.77450 0.26229126
11 0.77075 0.24923312
12 0.77050 0.24824716
13 0.77200 0.26409136
14 0.77200 0.26209103
15 0.76850 0.25053520
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 8.
preds_rf = make_predictions(rpart_fit, train,train$fracture, "rf")
plot_roc("RF",preds_rf,0.2,0.8,T)
classification_metrics(0.5, rpart_fit, "Step", train,train$fracture, "rf")
[1] "Confusion matrix for Step"
y
class No Yes
No 300 0
Yes 0 100
[1] "accuracy = 1"
[1] "precision = 1"
[1] "recall = 1"
[1] "F1 = 1"
preds_rf = make_predictions(rpart_fit, test,test$fracture, "rf")
plot_roc("RF",preds_rf,0.2,0.8,T)
classification_metrics(0.22, rpart_fit, "RF", test,test$fracture, "rf")
[1] "Confusion matrix for RF"
y
class No Yes
No 39 9
Yes 36 16
[1] "accuracy = 0.55"
[1] "precision = 0.308"
[1] "recall = 0.308"
[1] "F1 = 0.416"
#library(MASS)
#library(pheatmap)
#fit.lda <- lda(fracture ~ ., data = train_transformed, CV = TRUE)
#fit.lda
fitControl <- trainControl(
method = 'repeatedcv',
number = 10,
repeats = 10,
savePredictions = 'final',
verboseIter = FALSE,
classProbs = TRUE,
)
lda_fit = train(fracture ~ ., data=train_transformed, method="lda", trControl = fitControl)
print(lda_fit)
Linear Discriminant Analysis
400 samples
22 predictor
2 classes: 'No', 'Yes'
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 10 times)
Summary of sample sizes: 360, 360, 360, 360, 360, 360, ...
Resampling results:
Accuracy Kappa
0.74675 0.1975245
preds_lda = make_predictions(lda_fit, train_transformed,train_transformed$fracture, "lda")
plot_roc("LDA",preds_lda,0.2,0.8,T)
classification_metrics(0.22, lda_fit, "LDA", train_transformed,train_transformed$fracture, "lda")
[1] "Confusion matrix for LDA"
y
class No Yes
No 206 33
Yes 94 67
[1] "accuracy = 0.682"
[1] "precision = 0.416"
[1] "recall = 0.416"
[1] "F1 = 0.513"
preds_lda = make_predictions(lda_fit, test_transformed,test_transformed$fracture, "lda")
plot_roc("LDA",preds_lda,0.2,0.8,T)
classification_metrics(0.22, lda_fit, "LDA", test_transformed,test_transformed$fracture, "lda")
[1] "Confusion matrix for LDA"
y
class No Yes
No 47 11
Yes 28 14
[1] "accuracy = 0.61"
[1] "precision = 0.333"
[1] "recall = 0.333"
[1] "F1 = 0.418"
plot_roc("Baseline Lasso",preds_lasso,0.2,0.85,F, col="red")
par(new=T)
plot_roc("Complex Lasso",preds_ob2_lr,0.2,0.80,F, col="blue")
par(new=T)
plot_roc("Random Forrest",preds_rf,0.2,0.75,F, col="orange")
par(new=T)
plot_roc("LDA",preds_lda,0.2,0.7,F, col="green")
legend(0.7, 0.3,legend = c("Baseline Lasso","Complex Lasso", "Random Forrest", "LDA"), fill=c("red","blue","orange","green"))